Why are we doing this study?

This is a case study used as a capstone for the Google Data Analytics certificate. Problem: My favorite activity in the world is sleeping. Could I optimize it by controlling the different variables that might affect it?

Solution: Let’s dive into a sleep study to discover which variables improve or ruin a good night’s sleep!

Statistical Information

library(tidyverse)
library(ggplot2)
library(patchwork)
# Read the sleep data set from the working directory and preview its first rows.
data <- read.csv(file = "sleepdata.csv")
head(data, n = 6L)
##   Person.ID Gender Age           Occupation Sleep.Duration Quality.of.Sleep
## 1         1   Male  27    Software Engineer            6.1                6
## 2         2   Male  28               Doctor            6.2                6
## 3         3   Male  28               Doctor            6.2                6
## 4         4   Male  28 Sales Representative            5.9                4
## 5         5   Male  28 Sales Representative            5.9                4
## 6         6   Male  28    Software Engineer            5.9                4
##   Physical.Activity.Level Stress.Level BMI.Category Blood.Pressure Heart.Rate
## 1                      42            6   Overweight         126/83         77
## 2                      60            8       Normal         125/80         75
## 3                      60            8       Normal         125/80         75
## 4                      30            8        Obese         140/90         85
## 5                      30            8        Obese         140/90         85
## 6                      30            8        Obese         140/90         85
##   Daily.Steps Sleep.Disorder
## 1        4200           None
## 2       10000           None
## 3       10000           None
## 4        3000    Sleep Apnea
## 5        3000    Sleep Apnea
## 6        3000       Insomnia
# Column-wise overview: quartiles and mean for numeric columns,
# length/class/mode for character columns.
data %>% summary()
##    Person.ID         Gender               Age         Occupation       
##  Min.   :  1.00   Length:374         Min.   :27.00   Length:374        
##  1st Qu.: 94.25   Class :character   1st Qu.:35.25   Class :character  
##  Median :187.50   Mode  :character   Median :43.00   Mode  :character  
##  Mean   :187.50                      Mean   :42.18                     
##  3rd Qu.:280.75                      3rd Qu.:50.00                     
##  Max.   :374.00                      Max.   :59.00                     
##  Sleep.Duration  Quality.of.Sleep Physical.Activity.Level  Stress.Level  
##  Min.   :5.800   Min.   :4.000    Min.   :30.00           Min.   :3.000  
##  1st Qu.:6.400   1st Qu.:6.000    1st Qu.:45.00           1st Qu.:4.000  
##  Median :7.200   Median :7.000    Median :60.00           Median :5.000  
##  Mean   :7.132   Mean   :7.313    Mean   :59.17           Mean   :5.385  
##  3rd Qu.:7.800   3rd Qu.:8.000    3rd Qu.:75.00           3rd Qu.:7.000  
##  Max.   :8.500   Max.   :9.000    Max.   :90.00           Max.   :8.000  
##  BMI.Category       Blood.Pressure       Heart.Rate     Daily.Steps   
##  Length:374         Length:374         Min.   :65.00   Min.   : 3000  
##  Class :character   Class :character   1st Qu.:68.00   1st Qu.: 5600  
##  Mode  :character   Mode  :character   Median :70.00   Median : 7000  
##                                        Mean   :70.17   Mean   : 6817  
##                                        3rd Qu.:72.00   3rd Qu.: 8000  
##                                        Max.   :86.00   Max.   :10000  
##  Sleep.Disorder    
##  Length:374        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Number of missing (NA) values in each column
data %>%
  is.na() %>%
  colSums()
##               Person.ID                  Gender                     Age 
##                       0                       0                       0 
##              Occupation          Sleep.Duration        Quality.of.Sleep 
##                       0                       0                       0 
## Physical.Activity.Level            Stress.Level            BMI.Category 
##                       0                       0                       0 
##          Blood.Pressure              Heart.Rate             Daily.Steps 
##                       0                       0                       0 
##          Sleep.Disorder 
##                       0
# Mean age and number of participants for each gender.
gen_avg <- summarise(
  group_by(data, Gender),
  mean_age = mean(Age),
  n = n()
)

gen_avg
## # A tibble: 2 × 3
##   Gender mean_age     n
##   <chr>     <dbl> <int>
## 1 Female     47.4   185
## 2 Male       37.1   189

The average age of women is higher than that of men (47.4 vs. 37.1 years). This could skew the results, since age may be an important factor in sleep quality. Gender representation, however, is close to 50/50 (185 women and 189 men).

Exploratory Data Analysis (EDA)

# Store the sleep disorder as a factor with a fixed category order.
data$Sleep.Disorder <- factor(
  data$Sleep.Disorder,
  levels = c("None", "Insomnia", "Sleep Apnea")
)

# One row per sleep-disorder category: row count, share of all rows, and the
# share formatted as a percentage label, ordered by increasing share.
df <- data %>%
  group_by(Sleep.Disorder) %>%
  summarise(n = n(), .groups = "drop") %>%
  mutate(perc = n / sum(n)) %>%
  arrange(perc) %>%
  mutate(labels = scales::percent(perc))

# Pie chart: a single stacked column bent into a circle with polar coordinates.
ggplot(df, aes(x = "", y = perc, fill = Sleep.Disorder)) +
  geom_col() +
  # Centre each percentage label inside its slice.
  geom_label(
    aes(label = labels),
    position = position_stack(vjust = 0.5),
    color = "black",
    show.legend = FALSE
  ) +
  coord_polar(theta = "y", start = 0) +
  scale_fill_brewer(palette = "Reds") +
  theme_void() +
  # Must follow theme_void(), which would otherwise reset the title alignment.
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(title = "Percentage of Sleep Disorders")

The sleep classes are not equally represented, but the split between healthy sleepers and participants with a sleep disorder is roughly 60/40.

# Merge the duplicate "Normal Weight" spelling into "Normal".
data$BMI.Category <- gsub(
  "Normal Weight", "Normal", data$BMI.Category,
  fixed = TRUE
)

# Order the categories by increasing BMI.
# `levels =` matches the values already in the column. The previous
# `labels =` call renamed the alphabetically sorted levels
# (Normal, Obese, Overweight) by position, which swapped the Obese and
# Overweight rows.
# NOTE(review): any narrative written against the old labelling should be
# re-checked against the corrected figure.
data$BMI.Category <- factor(
  data$BMI.Category,
  levels = c("Normal", "Overweight", "Obese")
)

# Stacked bar chart: number of participants per BMI category, split by
# sleep disorder.
bmi_bar <- ggplot(data, aes(x = BMI.Category, fill = Sleep.Disorder)) +
  scale_fill_brewer(palette = "Reds") +
  geom_bar() +
  ggtitle("BMI effect on Sleep") +
  theme(plot.title = element_text(hjust = 0.5))
bmi_bar

We cleaned up this column by merging the “Normal” and “Normal Weight” samples. The overweight category seems severely underrepresented.

One conclusion we can draw from this figure is that BMI is correlated with sleep quality. A higher BMI tends to go together with insomnia and sleep apnea, while the lowest BMI category has a majority of normal sleepers. The Obese class is split almost 50/50 between the two sleep disorders. We cannot draw any firm conclusions about the Overweight category because of its small sample size, but it does seem to indicate a correlation with sleep disorders.

# Scatter plot of stress level per occupation; point fill shows the sleep
# disorder and point size shows age. This is the panel that displays age and
# stress, so it carries the "Occupation, Age and Stress" title (the two
# titles were previously swapped between the panels).
occup_point <- ggplot(data, aes(x = Occupation, y = Stress.Level)) +
  geom_point(
    aes(fill = Sleep.Disorder, size = Age),
    color = "black", shape = 21, stroke = 0.4
  ) +
  scale_fill_brewer(palette = "Reds") +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 0.5),
    plot.margin = margin(t = 5, r = 10, b = 5, l = 10),
    axis.title.x = element_text(margin = margin(t = 20)),
    plot.title = element_text(hjust = 0.5)
  ) +
  ggtitle("Occupation, Age and Stress effect on Sleep")

# Stacked bar chart: number of participants per occupation, split by sleep
# disorder only.
occup_bar <- ggplot(data, aes(x = Occupation, fill = Sleep.Disorder)) +
  geom_bar() +
  scale_fill_brewer(palette = "Reds") +
  ggtitle("Occupation effect on Sleep") +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 0.7),
    plot.margin = margin(t = 5, r = 10, b = 5, l = 10),
    axis.title.x = element_text(margin = margin(t = 20)),
    plot.title = element_text(hjust = 0.5)
  )

# patchwork: show both panels side by side.
occup_bar + occup_point

# Blood-pressure category names; entry i of each limits list below belongs to
# BP_ranges[i].
BP_ranges <- c(
  "Normal", "Elevated", "Hypertension_1", "Hypertension_2",
  "Hypertensive_Crisis"
)
# c(lower, upper) bounds per category for the systolic and diastolic readings.
BP_systolic_limits <- list(
  c(0, 120), c(120, 130), c(130, 140), c(140, 180), c(180, 200)
)
BP_diastolic_limits <- list(
  c(0, 80), c(0, 80), c(80, 90), c(90, 120), c(120, 140)
)

# TRUE where both readings lie within the i-th pair of bounds.
# between() includes both end points.
in_bp_band <- function(systolic, diastolic, i) {
  sys_lim <- BP_systolic_limits[[i]]
  dia_lim <- BP_diastolic_limits[[i]]
  between(systolic, sys_lim[1], sys_lim[2]) &
    between(diastolic, dia_lim[1], dia_lim[2])
}

# Split "systolic/diastolic" into two numeric columns, then label each row
# with the first category whose bounds contain both readings. Rows matching
# no category are labelled "Unknown".
data <- data %>%
  separate(Blood.Pressure, into = c("Systolic", "Diastolic"), sep = "/") %>%
  mutate(
    Systolic = as.numeric(Systolic),
    Diastolic = as.numeric(Diastolic),
    BP_range = case_when(
      in_bp_band(Systolic, Diastolic, 1) ~ BP_ranges[1],
      in_bp_band(Systolic, Diastolic, 2) ~ BP_ranges[2],
      in_bp_band(Systolic, Diastolic, 3) ~ BP_ranges[3],
      in_bp_band(Systolic, Diastolic, 4) ~ BP_ranges[4],
      in_bp_band(Systolic, Diastolic, 5) ~ BP_ranges[5],
      TRUE ~ "Unknown" # Default case
    )
  )


# Hand-picked categories for the rows still labelled "Unknown", assigned
# by position in row order.
replacement_values <- c(rep("Elevated", 12), rep("Hypertension_1", 2), "Elevated")

unknown_rows <- data$BP_range == "Unknown"

# The labels are positional, so they are only valid when the number of
# "Unknown" rows equals the number of labels. Without this check a different
# input file would be recycled or truncated and mislabelled.
if (sum(unknown_rows) != length(replacement_values)) {
  warning(
    "Expected ", length(replacement_values), " 'Unknown' BP_range rows but found ",
    sum(unknown_rows), "; manual replacement labels may be misassigned.",
    call. = FALSE
  )
}

# Heart.Rate is no longer converted to a factor here: that conversion was
# reverted to numeric immediately afterwards and had no effect.
data <- data %>%
  mutate(BP_range = replace(BP_range, unknown_rows, replacement_values))

# Make sure Heart.Rate is plain numeric (works whether it is currently a
# factor or a number) so it can be mapped to a continuous colour scale.
data <- data %>%
  mutate(Heart.Rate = as.numeric(as.character(Heart.Rate)))

# Count the rows for every BP range / heart rate / sleep disorder / age
# combination, then order BP_range as listed in BP_ranges.
data_grouped <- data %>%
  as_tibble() %>%
  count(BP_range, Heart.Rate, Sleep.Disorder, Age, name = "Count") %>%
  mutate(BP_range = factor(BP_range, levels = BP_ranges))

# Stacked columns of row counts per BP range, coloured by heart rate.
blood_bar <- data_grouped %>%
  ggplot(aes(x = BP_range, y = Count, fill = Heart.Rate)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  scale_fill_viridis_c(option = "magma", direction = -1)

blood_bar

We separated the “Blood.Pressure” column into “Systolic” (the number before the slash) and “Diastolic” (the number after the slash) columns and then, depending on their combination, created a categorical blood pressure range column, “BP_range”, based on the AHA categories. We manually cleaned up the “Unknown” ranges by making assumptions (giving more weight to the systolic number) and assigning each of them a category.

# Heart rate against blood-pressure range; fill encodes the sleep disorder
# and point size encodes age.
blood_sleep <- data_grouped %>%
  ggplot(aes(x = BP_range, y = Heart.Rate)) +
  geom_point(
    mapping = aes(fill = Sleep.Disorder, size = Age),
    shape = 21, # filled circle with a separate outline colour
    color = "black",
    stroke = 0.4
  ) +
  scale_fill_brewer(palette = "Reds")

blood_sleep

# Treat stress level as a discrete category so it can use a brewer palette.
data$Stress.Level <- as.factor(data$Stress.Level)

# Daily steps against physical activity level; fill encodes stress level and
# point size encodes age. A dashed linear fit is drawn on top.
act_bar <- data %>%
  ggplot(aes(x = Physical.Activity.Level, y = Daily.Steps)) +
  geom_point(
    mapping = aes(fill = Stress.Level, size = Age),
    shape = 21,
    color = "black",
    stroke = 0.4
  ) +
  geom_smooth(method = lm, color = "darkred", linetype = "dashed") +
  scale_fill_brewer(palette = "Reds")

act_bar

# Sleep quality against sleep duration, one fixed-size point per participant,
# filled by sleep disorder.
sleep_q <- data %>%
  ggplot(aes(x = Sleep.Duration, y = Quality.of.Sleep)) +
  geom_point(
    mapping = aes(fill = Sleep.Disorder),
    size = 3,
    shape = 21,
    color = "black",
    stroke = 0.4
  ) +
  scale_fill_brewer(palette = "Reds")

sleep_q

# Grouped copy of the data. The plot below reads `data` directly and does
# not use this object.
gend_group <- data %>%
  group_by(Gender, Sleep.Disorder, Age)

# Mean age per gender, one dodged bar per sleep disorder.
# `fun` replaces the `fun.y` argument deprecated in ggplot2 3.3.0.
gender_bar <- ggplot(data, aes(x = Gender, y = Age, fill = Sleep.Disorder)) +
  scale_fill_brewer(palette = "Reds") +
  geom_bar(stat = "summary", fun = "mean", position = "dodge")

gender_bar

Conclusions

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.